03. Training and Test¶
Train/validate and test different models for each resampling strategy.
01. Imports and Settings¶
In [1]:
# Imports
from catboost import CatBoostClassifier
from IPython.display import display
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import os
import pandas as pd
import sys
import warnings
# libs
# Put the project checkout under the user's home directory on the module search
# path so that the `libs` package imported below can be resolved.
sys.path.append(
    os.path.abspath(os.path.expanduser("~") + "/remy-project/")
)  # path
from libs.hyper_optimization import (
HyperParamCatBoostClassifier,
HyperParamLightGBMClassifier,
HyperParamXGBoostClassifier,
HyperParamRandomForestClassifier
) # hyper_optimization.py
from libs.model_evaluation import (
plot_roc_curve,
plot_pr_curve,
train_validate_model,
test_model,
plot_feature_importances,
plot_weights
) # model_evaluation.py
from libs.utils import split_X_y, save_object # utils.py
# Silence every warning category for the rest of the session
warnings.simplefilter("ignore")

# Pandas display settings: two-decimal floats, up to 500 rows/columns shown,
# and wider cells so long values are truncated later
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.max_colwidth = 150  # Increase column width

# Default locations (relative to the working directory) for the input data,
# the serialized models and the generated reports
DATA_PATH, BIN_PATH, REPORTS_PATH = "data/", "bin/", "reports/"
02. Load Data¶
In [2]:
# Registry keyed by resampling strategy. Each strategy holds its training
# frame under "data" plus one slot per classifier with the fitted object and
# the train/test metric dictionaries (all placeholders until the cells below run).
resampled = {}
for strategy in ("ONLY_RUS", "ROS", "SMOTENC", "CTGAN"):
    resampled[strategy] = {"data": pd.DataFrame()}
    # A fresh placeholder dict is created for every classifier so slots never share state
    resampled[strategy].update(
        (clf_name, {"obj": object(), "train_metrics": {}, "test_metrics": {}})
        for clf_name in ("rf", "xgb", "lgb", "catboost", "xgb_boruta")
    )
In [3]:
# Load the training data produced by each resampling technique into its
# "data" slot of `resampled`
for strategy, csv_path in (
    ("ONLY_RUS", f"{DATA_PATH}resampling/train_data_ONLY_RUS.csv"),
    ("ROS", f"{DATA_PATH}resampling/train_data_ROS.csv"),
    ("SMOTENC", f"{DATA_PATH}resampling/train_data_SMOTENC.csv"),
    ("CTGAN", f"{DATA_PATH}resampling/train_data_CTGAN.csv"),
):
    resampled[strategy]["data"] = pd.read_csv(csv_path)

# Test data (the same frame is used to evaluate every strategy/model below)
test_df = pd.read_csv(f"{DATA_PATH}test_data.csv")

# Split test data into X and y, with "is_target" as the label column
X_test, y_test = split_X_y(test_df, "is_target", [])
In [4]:
# Categorical columns
categorical_cols = ["join_s", "sch_s", "sch_r"]

# Numerical columns
# ("expaned_energy" is spelled this way on purpose: the Boruta cell selects the
# column from the training data by this exact name)
numerical_cols = [
    "adv_r",
    "data_s",
    "dist_ch_to_bs",
    "dist_to_ch",
    "expaned_energy",
    "rank",
    "send_code",
    "who_ch",
]

# Full feature set: categorical columns first, then the numerical ones
features = [*categorical_cols, *numerical_cols]

# Subset of features selected by Boruta
boruta_features = [
    "join_s",
    "who_ch",
    "data_s",
    "expaned_energy",
    "dist_ch_to_bs",
    "send_code",
    "rank",
    "adv_r",
]
03. Train Models¶
03.1. CatBoost¶
In [5]:
# Tune, cross-validate, fit, test and persist one CatBoost model per resampling
# strategy; metrics and the fitted object are stored in resampled[strategy]["catboost"].
for strategy in ["ONLY_RUS", "ROS", "SMOTENC", "CTGAN"]:  # For each resampling strategy...
    print(f"\n>> {strategy}\n")

    # Split into X and y
    X_train, y_train = split_X_y(resampled[strategy]["data"], "is_target", [])

    # Labels for the importance plots are taken from the frame the model is
    # fitted on, so they always follow the column order the model actually sees
    # (a hand-maintained list could silently drift from the CSV column order).
    feature_names = list(X_train.columns)

    # Optimizing hyperparameters... (third return value is not needed here)
    model, best_hyperparams, _ = HyperParamCatBoostClassifier(
        X_train, y_train, n_trials=10).run()

    # Cross-validation on the training data (k = 5 according to the original
    # note; the fold count is set inside train_validate_model)
    # NOTE(review): the estimator validated here is the one returned by the
    # optimizer, while the model tested below is rebuilt from best_hyperparams
    # with cat_features added - confirm both are configured the same way.
    resampled[strategy]["catboost"]["train_metrics"] = train_validate_model(model, X_train, y_train)

    # Training model...
    model = CatBoostClassifier(**best_hyperparams, cat_features=categorical_cols, logging_level="Silent")
    model.fit(X_train, y_train)

    # Model testing
    resampled[strategy]["catboost"]["test_metrics"] = test_model(model, X_test, y_test)
    print("\n")

    # ROC and PR Curves
    plot_roc_curve(model, X_test, y_test)
    print("\n")
    plot_pr_curve(model, X_test, y_test)

    # Save trained model
    resampled[strategy]["catboost"]["obj"] = model
    save_object(resampled[strategy]["catboost"]["obj"], f"{BIN_PATH}catboost_{strategy}_obj")
    print("\n")

    # Feature Importance
    plot_feature_importances(resampled[strategy]["catboost"]["obj"], feature_names, top_n=len(feature_names))

    # Weights
    plot_weights(resampled[strategy]["catboost"]["obj"], feature_names, top_n=len(feature_names))
    print("\n\n\n")
>> ONLY_RUS
[CatBoostClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '97.1 +- 0.15', 'Recall': '97.15 +- 0.23', 'F1': '97.12 +- 0.18', 'ROC_AUC': '99.9 +- 0.01', 'Log_Loss': '0.0584 +- 0.0031'}
Model Test: {'Precision': "81.04, {'0': 100.0, '1': 67.88, '2': 92.49, '3': 63.79}", 'Recall': "98.23, {'0': 97.25, '1': 99.76, '2': 99.85, '3': 96.07}", 'F1': "88.02, {'0': 98.61, '1': 80.79, '2': 96.03, '3': 76.67}", 'ROC_AUC': '99.91', 'Log_Loss': '0.116'}
| Weight | Feature |
|---|---|
| 0.3014 | expaned_energy |
| 0.1890 | dist_ch_to_bs |
| 0.1755 | adv_r |
| 0.1410 | who_ch |
| 0.0798 | sch_s |
| 0.0459 | rank |
| 0.0284 | data_s |
| 0.0122 | sch_r |
| 0.0102 | join_s |
| 0.0100 | dist_to_ch |
| 0.0066 | send_code |
>> ROS
[CatBoostClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '97.96 +- 0.1', 'Recall': '97.96 +- 0.17', 'F1': '97.95 +- 0.11', 'ROC_AUC': '99.93 +- 0.01', 'Log_Loss': '0.054 +- 0.0031'}
Model Test: {'Precision': "80.74, {'0': 100.0, '1': 66.41, '2': 92.1, '3': 64.46}", 'Recall': "98.46, {'0': 97.1, '1': 99.76, '2': 99.7, '3': 97.28}", 'F1': "87.89, {'0': 98.53, '1': 79.74, '2': 95.75, '3': 77.54}", 'ROC_AUC': '99.9', 'Log_Loss': '0.1295'}
| Weight | Feature |
|---|---|
| 0.3043 | expaned_energy |
| 0.2053 | dist_ch_to_bs |
| 0.1676 | adv_r |
| 0.1395 | who_ch |
| 0.0811 | sch_s |
| 0.0370 | rank |
| 0.0247 | data_s |
| 0.0137 | dist_to_ch |
| 0.0107 | sch_r |
| 0.0083 | join_s |
| 0.0078 | send_code |
>> SMOTENC
[CatBoostClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '97.6 +- 0.14', 'Recall': '97.56 +- 0.28', 'F1': '97.57 +- 0.17', 'ROC_AUC': '99.91 +- 0.01', 'Log_Loss': '0.0598 +- 0.0024'}
Model Test: {'Precision': "77.76, {'0': 100.0, '1': 64.76, '2': 91.69, '3': 54.59}", 'Recall': "98.18, {'0': 96.68, '1': 99.73, '2': 99.35, '3': 96.98}", 'F1': "85.52, {'0': 98.31, '1': 78.53, '2': 95.37, '3': 69.86}", 'ROC_AUC': '99.88', 'Log_Loss': '0.1359'}
| Weight | Feature |
|---|---|
| 0.3071 | expaned_energy |
| 0.2100 | dist_ch_to_bs |
| 0.1622 | adv_r |
| 0.1402 | who_ch |
| 0.0756 | sch_s |
| 0.0385 | rank |
| 0.0245 | data_s |
| 0.0158 | send_code |
| 0.0114 | dist_to_ch |
| 0.0079 | join_s |
| 0.0068 | sch_r |
>> CTGAN
[CatBoostClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '97.52 +- 0.1', 'Recall': '97.35 +- 0.26', 'F1': '97.43 +- 0.18', 'ROC_AUC': '99.91 +- 0.01', 'Log_Loss': '0.0573 +- 0.0026'}
Model Test: {'Precision': "79.57, {'0': 100.0, '1': 70.04, '2': 89.69, '3': 58.57}", 'Recall': "97.98, {'0': 97.24, '1': 99.73, '2': 99.95, '3': 95.02}", 'F1': "86.97, {'0': 98.6, '1': 82.29, '2': 94.54, '3': 72.47}", 'ROC_AUC': '99.9', 'Log_Loss': '0.1152'}
| Weight | Feature |
|---|---|
| 0.2818 | dist_ch_to_bs |
| 0.2470 | expaned_energy |
| 0.1367 | adv_r |
| 0.1075 | who_ch |
| 0.0862 | dist_to_ch |
| 0.0769 | sch_s |
| 0.0263 | rank |
| 0.0197 | data_s |
| 0.0072 | join_s |
| 0.0069 | sch_r |
| 0.0039 | send_code |
03.2. LightGBM¶
In [6]:
# Tune, cross-validate, fit, test and persist one LightGBM model per resampling
# strategy; metrics and the fitted object are stored in resampled[strategy]["lgb"].
for strategy in ["ONLY_RUS", "ROS", "SMOTENC", "CTGAN"]:  # For each resampling strategy...
    print(f"\n>> {strategy}\n")

    # Split into X and y
    X_train, y_train = split_X_y(resampled[strategy]["data"], "is_target", [])

    # Labels for the importance plots are taken from the frame the model is
    # fitted on, so they always follow the column order the model actually sees
    # (a hand-maintained list could silently drift from the CSV column order).
    feature_names = list(X_train.columns)

    # Optimizing hyperparameters... (third return value is not needed here)
    model, best_hyperparams, _ = HyperParamLightGBMClassifier(
        X_train, y_train, n_trials=10).run()

    # Cross-validation on the training data (k = 5 according to the original
    # note; the fold count is set inside train_validate_model)
    resampled[strategy]["lgb"]["train_metrics"] = train_validate_model(model, X_train, y_train)

    # Training model... (verbosity=-1 keeps LightGBM's own logging quiet)
    model = LGBMClassifier(**best_hyperparams, verbosity=-1)
    model.fit(X_train, y_train)

    # Model testing
    resampled[strategy]["lgb"]["test_metrics"] = test_model(model, X_test, y_test)
    print("\n")

    # ROC and PR Curves
    plot_roc_curve(model, X_test, y_test)
    print("\n")
    plot_pr_curve(model, X_test, y_test)

    # Save trained model
    resampled[strategy]["lgb"]["obj"] = model
    save_object(resampled[strategy]["lgb"]["obj"], f"{BIN_PATH}lgb_{strategy}_obj")
    print("\n")

    # Feature Importance
    plot_feature_importances(resampled[strategy]["lgb"]["obj"], feature_names, top_n=len(feature_names))

    # Weights
    plot_weights(resampled[strategy]["lgb"]["obj"], feature_names, top_n=len(feature_names))
    print("\n\n\n")
>> ONLY_RUS
[LGBMClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '97.75 +- 0.15', 'Recall': '97.9 +- 0.26', 'F1': '97.82 +- 0.19', 'ROC_AUC': '99.94 +- 0.01', 'Log_Loss': '0.0446 +- 0.0037'}
Model Test: {'Precision': "87.38, {'0': 100.0, '1': 67.24, '2': 99.4, '3': 82.9}", 'Recall': "98.28, {'0': 97.78, '1': 99.69, '2': 98.96, '3': 96.68}", 'F1': "91.91, {'0': 98.87, '1': 80.31, '2': 99.18, '3': 89.26}", 'ROC_AUC': '99.94', 'Log_Loss': '0.1032'}
| Weight | Feature |
|---|---|
| 0.4064 | rank |
| 0.3376 | expaned_energy |
| 0.0815 | dist_ch_to_bs |
| 0.0732 | adv_r |
| 0.0299 | sch_s |
| 0.0295 | join_s |
| 0.0257 | who_ch |
| 0.0158 | data_s |
| 0.0002 | sch_r |
| 0.0001 | send_code |
| 0 | dist_to_ch |
>> ROS
[LGBMClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '98.57 +- 0.17', 'Recall': '98.6 +- 0.17', 'F1': '98.58 +- 0.15', 'ROC_AUC': '99.95 +- 0.01', 'Log_Loss': '0.0383 +- 0.0031'}
Model Test: {'Precision': "87.05, {'0': 100.0, '1': 66.71, '2': 99.5, '3': 81.99}", 'Recall': "98.34, {'0': 97.71, '1': 99.76, '2': 98.91, '3': 96.98}", 'F1': "91.71, {'0': 98.84, '1': 79.96, '2': 99.2, '3': 88.86}", 'ROC_AUC': '99.94', 'Log_Loss': '0.1098'}
| Weight | Feature |
|---|---|
| 0.3760 | rank |
| 0.3591 | expaned_energy |
| 0.0865 | dist_ch_to_bs |
| 0.0778 | adv_r |
| 0.0320 | sch_s |
| 0.0284 | who_ch |
| 0.0256 | join_s |
| 0.0144 | data_s |
| 0.0002 | sch_r |
| 0.0000 | send_code |
| 0.0000 | dist_to_ch |
>> SMOTENC
[LGBMClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '98.14 +- 0.11', 'Recall': '98.12 +- 0.2', 'F1': '98.13 +- 0.12', 'ROC_AUC': '99.93 +- 0.01', 'Log_Loss': '0.0468 +- 0.0037'}
Model Test: {'Precision': "84.82, {'0': 100.0, '1': 63.72, '2': 98.81, '3': 76.73}", 'Recall': "98.28, {'0': 97.33, '1': 99.28, '2': 99.4, '3': 97.13}", 'F1': "90.28, {'0': 98.64, '1': 77.62, '2': 99.11, '3': 85.73}", 'ROC_AUC': '99.89', 'Log_Loss': '0.1259'}
| Weight | Feature |
|---|---|
| 0.3771 | rank |
| 0.3544 | expaned_energy |
| 0.0880 | dist_ch_to_bs |
| 0.0780 | adv_r |
| 0.0304 | sch_s |
| 0.0296 | who_ch |
| 0.0277 | join_s |
| 0.0146 | data_s |
| 0.0001 | sch_r |
| 0.0001 | send_code |
| 0 | dist_to_ch |
>> CTGAN
[LGBMClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '98.0 +- 0.13', 'Recall': '98.01 +- 0.2', 'F1': '98.0 +- 0.13', 'ROC_AUC': '99.94 +- 0.01', 'Log_Loss': '0.043 +- 0.0027'}
Model Test: {'Precision': "86.24, {'0': 100.0, '1': 67.28, '2': 99.4, '3': 78.27}", 'Recall': "98.3, {'0': 97.71, '1': 99.69, '2': 98.96, '3': 96.83}", 'F1': "91.23, {'0': 98.84, '1': 80.34, '2': 99.18, '3': 86.56}", 'ROC_AUC': '99.94', 'Log_Loss': '0.1011'}
| Weight | Feature |
|---|---|
| 0.3466 | rank |
| 0.3019 | expaned_energy |
| 0.1515 | dist_ch_to_bs |
| 0.0702 | adv_r |
| 0.0367 | sch_s |
| 0.0347 | who_ch |
| 0.0279 | join_s |
| 0.0153 | data_s |
| 0.0147 | dist_to_ch |
| 0.0003 | sch_r |
| 0.0001 | send_code |
03.3. XGBoost¶
In [7]:
# Tune, cross-validate, fit, test and persist one XGBoost model per resampling
# strategy; metrics and the fitted object are stored in resampled[strategy]["xgb"].
for strategy in ["ONLY_RUS", "ROS", "SMOTENC", "CTGAN"]:  # For each resampling strategy...
    print(f"\n>> {strategy}\n")

    # Split into X and y
    X_train, y_train = split_X_y(resampled[strategy]["data"], "is_target", [])

    # Labels for the importance plots are taken from the frame the model is
    # fitted on, so they always follow the column order the model actually sees
    # (a hand-maintained list could silently drift from the CSV column order).
    feature_names = list(X_train.columns)

    # Optimizing hyperparameters... (third return value is not needed here)
    model, best_hyperparams, _ = HyperParamXGBoostClassifier(
        X_train, y_train, n_trials=10).run()

    # Cross-validation on the training data (k = 5 according to the original
    # note; the fold count is set inside train_validate_model)
    resampled[strategy]["xgb"]["train_metrics"] = train_validate_model(model, X_train, y_train)

    # Training model...
    model = XGBClassifier(**best_hyperparams)
    model.fit(X_train, y_train)

    # Model testing
    resampled[strategy]["xgb"]["test_metrics"] = test_model(model, X_test, y_test)
    print("\n")

    # ROC and PR Curves
    plot_roc_curve(model, X_test, y_test)
    print("\n")
    plot_pr_curve(model, X_test, y_test)

    # Save trained model
    resampled[strategy]["xgb"]["obj"] = model
    save_object(resampled[strategy]["xgb"]["obj"], f"{BIN_PATH}xgb_{strategy}_obj")
    print("\n")

    # Feature Importance
    plot_feature_importances(resampled[strategy]["xgb"]["obj"], feature_names, top_n=len(feature_names))

    # Weights
    plot_weights(resampled[strategy]["xgb"]["obj"], feature_names, top_n=len(feature_names))
    print("\n\n\n")
>> ONLY_RUS
[XGBClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '97.65 +- 0.25', 'Recall': '97.73 +- 0.23', 'F1': '97.69 +- 0.23', 'ROC_AUC': '99.93 +- 0.01', 'Log_Loss': '0.0482 +- 0.0039'}
Model Test: {'Precision': "88.16, {'0': 99.99, '1': 67.03, '2': 99.5, '3': 86.12}", 'Recall': "98.22, {'0': 97.8, '1': 99.76, '2': 98.81, '3': 96.53}", 'F1': "92.31, {'0': 98.88, '1': 80.19, '2': 99.15, '3': 91.03}", 'ROC_AUC': '99.94', 'Log_Loss': '0.1068'}
| Weight | Feature |
|---|---|
| 0.9163 | rank |
| 0.0308 | join_s |
| 0.0254 | data_s |
| 0.0105 | sch_s |
| 0.0104 | expaned_energy |
| 0.0030 | dist_ch_to_bs |
| 0.0026 | adv_r |
| 0.0007 | who_ch |
| 0.0004 | sch_r |
| 0 | send_code |
| 0 | dist_to_ch |
>> ROS
[XGBClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '98.53 +- 0.15', 'Recall': '98.56 +- 0.22', 'F1': '98.54 +- 0.18', 'ROC_AUC': '99.95 +- 0.01', 'Log_Loss': '0.0391 +- 0.0044'}
Model Test: {'Precision': "84.96, {'0': 99.99, '1': 66.77, '2': 99.5, '3': 73.59}", 'Recall': "98.05, {'0': 97.6, '1': 99.76, '2': 98.46, '3': 96.37}", 'F1': "90.3, {'0': 98.78, '1': 80.0, '2': 98.97, '3': 83.45}", 'ROC_AUC': '99.94', 'Log_Loss': '0.1141'}
| Weight | Feature |
|---|---|
| 0.9255 | rank |
| 0.0252 | join_s |
| 0.0200 | data_s |
| 0.0114 | sch_s |
| 0.0106 | expaned_energy |
| 0.0032 | dist_ch_to_bs |
| 0.0029 | adv_r |
| 0.0007 | who_ch |
| 0.0005 | sch_r |
| 0 | send_code |
| 0 | dist_to_ch |
>> SMOTENC
[XGBClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '98.06 +- 0.19', 'Recall': '97.97 +- 0.18', 'F1': '98.01 +- 0.14', 'ROC_AUC': '99.93 +- 0.01', 'Log_Loss': '0.0494 +- 0.0031'}
Model Test: {'Precision': "83.75, {'0': 100.0, '1': 65.15, '2': 98.76, '3': 71.08}", 'Recall': "98.31, {'0': 97.38, '1': 99.25, '2': 99.35, '3': 97.28}", 'F1': "89.63, {'0': 98.67, '1': 78.66, '2': 99.06, '3': 82.14}", 'ROC_AUC': '99.87', 'Log_Loss': '0.1342'}
| Weight | Feature |
|---|---|
| 0.9219 | rank |
| 0.0304 | join_s |
| 0.0200 | data_s |
| 0.0107 | sch_s |
| 0.0097 | expaned_energy |
| 0.0033 | dist_ch_to_bs |
| 0.0029 | adv_r |
| 0.0007 | who_ch |
| 0.0003 | sch_r |
| 0.0001 | dist_to_ch |
| 0 | send_code |
>> CTGAN
[XGBClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '97.93 +- 0.14', 'Recall': '97.74 +- 0.21', 'F1': '97.83 +- 0.16', 'ROC_AUC': '99.93 +- 0.01', 'Log_Loss': '0.0471 +- 0.0036'}
Model Test: {'Precision': "87.15, {'0': 99.99, '1': 67.5, '2': 99.45, '3': 81.67}", 'Recall': "98.16, {'0': 97.79, '1': 99.76, '2': 98.86, '3': 96.22}", 'F1': "91.72, {'0': 98.88, '1': 80.52, '2': 99.15, '3': 88.35}", 'ROC_AUC': '99.94', 'Log_Loss': '0.103'}
| Weight | Feature |
|---|---|
| 0.9246 | rank |
| 0.0299 | join_s |
| 0.0189 | data_s |
| 0.0102 | sch_s |
| 0.0079 | expaned_energy |
| 0.0042 | dist_ch_to_bs |
| 0.0024 | adv_r |
| 0.0009 | dist_to_ch |
| 0.0007 | who_ch |
| 0.0003 | sch_r |
| 0.0000 | send_code |
03.4. Random Forest¶
In [8]:
# Tune, cross-validate, fit, test and persist one Random Forest model per
# resampling strategy; metrics and the fitted object are stored in
# resampled[strategy]["rf"].
for strategy in ["ONLY_RUS", "ROS", "SMOTENC", "CTGAN"]:  # For each resampling strategy...
    print(f"\n>> {strategy}\n")

    # Split into X and y
    X_train, y_train = split_X_y(resampled[strategy]["data"], "is_target", [])

    # Labels for the importance plots are taken from the frame the model is
    # fitted on, so they always follow the column order the model actually sees
    # (a hand-maintained list could silently drift from the CSV column order).
    feature_names = list(X_train.columns)

    # Optimizing hyperparameters... (third return value is not needed here)
    model, best_hyperparams, _ = HyperParamRandomForestClassifier(
        X_train, y_train, n_trials=10).run()

    # Cross-validation on the training data (k = 5 according to the original
    # note; the fold count is set inside train_validate_model)
    resampled[strategy]["rf"]["train_metrics"] = train_validate_model(model, X_train, y_train)

    # Training model...
    # NOTE(review): no random_state is passed here; unless best_hyperparams
    # carries one, refits will not be bit-for-bit reproducible - confirm.
    model = RandomForestClassifier(**best_hyperparams)
    model.fit(X_train, y_train)

    # Model testing
    resampled[strategy]["rf"]["test_metrics"] = test_model(model, X_test, y_test)
    print("\n")

    # ROC and PR Curves
    plot_roc_curve(model, X_test, y_test)
    print("\n")
    plot_pr_curve(model, X_test, y_test)

    # Save trained model
    resampled[strategy]["rf"]["obj"] = model
    save_object(resampled[strategy]["rf"]["obj"], f"{BIN_PATH}rf_{strategy}_obj")
    print("\n")

    # Feature Importance
    plot_feature_importances(resampled[strategy]["rf"]["obj"], feature_names, top_n=len(feature_names))

    # Weights
    plot_weights(resampled[strategy]["rf"]["obj"], feature_names, top_n=len(feature_names))
    print("\n\n\n")
>> ONLY_RUS
[RandomForestClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '97.5 +- 0.2', 'Recall': '97.09 +- 0.18', 'F1': '97.29 +- 0.16', 'ROC_AUC': '99.87 +- 0.04', 'Log_Loss': '0.0706 +- 0.0115'}
Model Test: {'Precision': "84.22, {'0': 99.99, '1': 69.7, '2': 95.14, '3': 72.06}", 'Recall': "98.22, {'0': 97.69, '1': 99.69, '2': 99.25, '3': 96.22}", 'F1': "90.11, {'0': 98.83, '1': 82.04, '2': 97.15, '3': 82.41}", 'ROC_AUC': '99.6', 'Log_Loss': '0.4912'}
| Weight | Feature |
|---|---|
| 0.5057 ± 0.0039 | rank |
| 0.2932 ± 0.0102 | expaned_energy |
| 0.0753 ± 0.0104 | adv_r |
| 0.0604 ± 0.0034 | dist_ch_to_bs |
| 0.0301 ± 0.0032 | who_ch |
| 0.0186 ± 0.0079 | data_s |
| 0.0149 ± 0.0079 | sch_s |
| 0.0006 ± 0.0005 | sch_r |
| 0.0005 ± 0.0007 | send_code |
| 0.0005 ± 0.0007 | join_s |
| 0.0003 ± 0.0006 | dist_to_ch |
>> ROS
[RandomForestClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '98.29 +- 0.14', 'Recall': '98.09 +- 0.26', 'F1': '98.18 +- 0.18', 'ROC_AUC': '99.91 +- 0.02', 'Log_Loss': '0.0586 +- 0.0094'}
Model Test: {'Precision': "83.36, {'0': 100.0, '1': 67.76, '2': 94.95, '3': 70.75}", 'Recall': "98.31, {'0': 97.48, '1': 99.73, '2': 99.2, '3': 96.83}", 'F1': "89.55, {'0': 98.72, '1': 80.69, '2': 97.03, '3': 81.76}", 'ROC_AUC': '99.58', 'Log_Loss': '0.5211'}
| Weight | Feature |
|---|---|
| 0.4612 ± 0.0042 | rank |
| 0.3071 ± 0.0204 | expaned_energy |
| 0.0995 ± 0.0210 | adv_r |
| 0.0664 ± 0.0035 | dist_ch_to_bs |
| 0.0311 ± 0.0035 | who_ch |
| 0.0206 ± 0.0133 | sch_s |
| 0.0124 ± 0.0119 | data_s |
| 0.0007 ± 0.0004 | sch_r |
| 0.0004 ± 0.0007 | send_code |
| 0.0003 ± 0.0005 | dist_to_ch |
| 0.0003 ± 0.0005 | join_s |
>> SMOTENC
[RandomForestClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '97.93 +- 0.11', 'Recall': '97.64 +- 0.2', 'F1': '97.78 +- 0.08', 'ROC_AUC': '99.88 +- 0.02', 'Log_Loss': '0.0725 +- 0.0083'}
Model Test: {'Precision': "79.06, {'0': 100.0, '1': 64.8, '2': 93.47, '3': 57.98}", 'Recall': "98.18, {'0': 96.86, '1': 99.73, '2': 99.0, '3': 97.13}", 'F1': "86.43, {'0': 98.4, '1': 78.56, '2': 96.16, '3': 72.61}", 'ROC_AUC': '99.53', 'Log_Loss': '0.5274'}
| Weight | Feature |
|---|---|
| 0.4611 ± 0.0039 | rank |
| 0.3058 ± 0.0218 | expaned_energy |
| 0.0939 ± 0.0215 | adv_r |
| 0.0686 ± 0.0044 | dist_ch_to_bs |
| 0.0366 ± 0.0057 | who_ch |
| 0.0169 ± 0.0035 | data_s |
| 0.0157 ± 0.0052 | sch_s |
| 0.0004 ± 0.0007 | join_s |
| 0.0004 ± 0.0006 | dist_to_ch |
| 0.0003 ± 0.0003 | sch_r |
| 0.0003 ± 0.0005 | send_code |
>> CTGAN
[RandomForestClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '97.46 +- 0.15', 'Recall': '97.13 +- 0.3', 'F1': '97.29 +- 0.19', 'ROC_AUC': '99.87 +- 0.02', 'Log_Loss': '0.0697 +- 0.0071'}
Model Test: {'Precision': "82.7, {'0': 99.99, '1': 67.19, '2': 94.52, '3': 69.09}", 'Recall': "97.9, {'0': 97.39, '1': 99.76, '2': 99.6, '3': 94.86}", 'F1': "88.98, {'0': 98.68, '1': 80.3, '2': 97.0, '3': 79.95}", 'ROC_AUC': '99.7', 'Log_Loss': '0.487'}
| Weight | Feature |
|---|---|
| 0.4627 ± 0.0035 | rank |
| 0.2551 ± 0.0231 | expaned_energy |
| 0.1174 ± 0.0196 | dist_ch_to_bs |
| 0.0786 ± 0.0034 | adv_r |
| 0.0293 ± 0.0114 | sch_s |
| 0.0277 ± 0.0028 | who_ch |
| 0.0161 ± 0.0073 | data_s |
| 0.0119 ± 0.0039 | dist_to_ch |
| 0.0005 ± 0.0004 | sch_r |
| 0.0003 ± 0.0005 | send_code |
| 0.0003 ± 0.0006 | join_s |
03.5. XGBoost-Boruta¶
In [9]:
# XGBoost restricted to the Boruta-selected columns: tune, cross-validate, fit,
# test and persist one model per resampling strategy. Results are stored in
# resampled[strategy]["xgb_boruta"].
for strategy in ("ONLY_RUS", "ROS", "SMOTENC", "CTGAN"):
    print(f"\n>> {strategy}\n")

    # Slot of the registry that receives this model's metrics and fitted object
    boruta_slot = resampled[strategy]["xgb_boruta"]

    # Keep only the Boruta features (plus the label), then split into X and y
    boruta_frame = resampled[strategy]["data"][[*boruta_features, "is_target"]]
    X_train, y_train = split_X_y(boruta_frame, "is_target", [])

    # Hyperparameter search (third return value is not needed here)
    optimizer = HyperParamXGBoostClassifier(X_train, y_train, n_trials=10)
    model, best_hyperparams, _ = optimizer.run()

    # Cross-validation on the training data with the estimator returned by the search
    boruta_slot["train_metrics"] = train_validate_model(model, X_train, y_train)

    # Refit a fresh classifier with the best hyperparameters, XGBoost logging off
    model = XGBClassifier(**best_hyperparams, verbosity=0)
    model.fit(X_train, y_train)

    # Evaluate on the test set, restricted to the same Boruta columns
    boruta_slot["test_metrics"] = test_model(model, X_test[boruta_features], y_test)
    print("\n")

    # ROC curve, then precision-recall curve, on the test set
    plot_roc_curve(model, X_test[boruta_features], y_test)
    print("\n")
    plot_pr_curve(model, X_test[boruta_features], y_test)

    # Keep the fitted model in the registry and persist it under BIN_PATH
    boruta_slot["obj"] = model
    save_object(boruta_slot["obj"], f"{BIN_PATH}xgb_boruta_{strategy}_obj")
    print("\n")

    # Feature importances and weights, labelled with the Boruta column names
    n_boruta = len(boruta_features)
    plot_feature_importances(boruta_slot["obj"], boruta_features, top_n=n_boruta)
    plot_weights(boruta_slot["obj"], boruta_features, top_n=n_boruta)
    print("\n\n\n")
>> ONLY_RUS
[XGBClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '97.34 +- 0.22', 'Recall': '97.5 +- 0.19', 'F1': '97.42 +- 0.18', 'ROC_AUC': '99.92 +- 0.01', 'Log_Loss': '0.0521 +- 0.0038'}
Model Test: {'Precision': "86.33, {'0': 100.0, '1': 65.23, '2': 99.55, '3': 80.56}", 'Recall': "97.98, {'0': 97.57, '1': 99.73, '2': 98.26, '3': 96.37}", 'F1': "91.07, {'0': 98.77, '1': 78.87, '2': 98.9, '3': 87.76}", 'ROC_AUC': '99.93', 'Log_Loss': '0.122'}
| Weight | Feature |
|---|---|
| 0.9250 | rank |
| 0.0309 | data_s |
| 0.0267 | join_s |
| 0.0101 | expaned_energy |
| 0.0033 | adv_r |
| 0.0032 | dist_ch_to_bs |
| 0.0008 | who_ch |
| 0 | send_code |
>> ROS
[XGBClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '98.38 +- 0.2', 'Recall': '98.49 +- 0.25', 'F1': '98.43 +- 0.21', 'ROC_AUC': '99.94 +- 0.01', 'Log_Loss': '0.0417 +- 0.0038'}
Model Test: {'Precision': "85.53, {'0': 100.0, '1': 65.58, '2': 99.45, '3': 77.09}", 'Recall': "98.04, {'0': 97.54, '1': 99.73, '2': 98.81, '3': 96.07}", 'F1': "90.64, {'0': 98.75, '1': 79.12, '2': 99.13, '3': 85.54}", 'ROC_AUC': '99.93', 'Log_Loss': '0.1302'}
| Weight | Feature |
|---|---|
| 0.9329 | rank |
| 0.0267 | data_s |
| 0.0219 | join_s |
| 0.0106 | expaned_energy |
| 0.0036 | adv_r |
| 0.0035 | dist_ch_to_bs |
| 0.0008 | who_ch |
| 0 | send_code |
>> SMOTENC
[XGBClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '98.03 +- 0.17', 'Recall': '97.92 +- 0.19', 'F1': '97.97 +- 0.15', 'ROC_AUC': '99.92 +- 0.01', 'Log_Loss': '0.0524 +- 0.0028'}
Model Test: {'Precision': "84.86, {'0': 100.0, '1': 64.62, '2': 98.8, '3': 76.01}", 'Recall': "97.94, {'0': 97.44, '1': 99.25, '2': 98.41, '3': 96.68}", 'F1': "90.17, {'0': 98.7, '1': 78.28, '2': 98.6, '3': 85.11}", 'ROC_AUC': '99.87', 'Log_Loss': '0.1385'}
| Weight | Feature |
|---|---|
| 0.9266 | rank |
| 0.0279 | join_s |
| 0.0271 | data_s |
| 0.0101 | expaned_energy |
| 0.0039 | adv_r |
| 0.0035 | dist_ch_to_bs |
| 0.0009 | who_ch |
| 0 | send_code |
>> CTGAN
[XGBClassifier] Optimizing: 10/10...
Training and Validation: {'Precision': '97.56 +- 0.1', 'Recall': '97.34 +- 0.15', 'F1': '97.45 +- 0.11', 'ROC_AUC': '99.91 +- 0.01', 'Log_Loss': '0.0553 +- 0.0031'}
Model Test: {'Precision': "83.86, {'0': 100.0, '1': 66.34, '2': 99.35, '3': 69.76}", 'Recall': "98.15, {'0': 97.48, '1': 99.59, '2': 99.0, '3': 96.53}", 'F1': "89.63, {'0': 98.72, '1': 79.63, '2': 99.18, '3': 80.99}", 'ROC_AUC': '99.92', 'Log_Loss': '0.1301'}
| Weight | Feature |
|---|---|
| 0.9269 | rank |
| 0.0290 | data_s |
| 0.0281 | join_s |
| 0.0076 | expaned_energy |
| 0.0045 | dist_ch_to_bs |
| 0.0030 | adv_r |
| 0.0009 | who_ch |
| 0 | send_code |
04. Synthesized Results¶
In [10]:
# Flatten the nested {strategy: {model: {...}}} registry into one record per
# (strategy, model) pair. The "data" entry holds the training frame rather
# than a model, so it is skipped.
results = [
    {
        "strategy": strategy,
        "model": model_name.upper(),
        "train_metrics": model_metrics.get("train_metrics", {}),
        "test_metrics": model_metrics.get("test_metrics", {}),
    }
    for strategy, models in resampled.items()
    for model_name, model_metrics in models.items()
    if model_name != "data"
]

# One row per record; the metric columns hold the metric dictionaries as-is
results_df = pd.DataFrame(results)

# Results...
display(results_df)
| strategy | model | train_metrics | test_metrics | |
|---|---|---|---|---|
| 0 | ONLY_RUS | RF | {'Precision': '97.5 +- 0.2', 'Recall': '97.09 +- 0.18', 'F1': '97.29 +- 0.16', 'ROC_AUC': '99.87 +- 0.04', 'Log_Loss': '0.0706 +- 0.0115'} | {'Precision': '84.22, {'0': 99.99, '1': 69.7, '2': 95.14, '3': 72.06}', 'Recall': '98.22, {'0': 97.69, '1': 99.69, '2': 99.25, '3': 96.22}', 'F1':... |
| 1 | ONLY_RUS | XGB | {'Precision': '97.65 +- 0.25', 'Recall': '97.73 +- 0.23', 'F1': '97.69 +- 0.23', 'ROC_AUC': '99.93 +- 0.01', 'Log_Loss': '0.0482 +- 0.0039'} | {'Precision': '88.16, {'0': 99.99, '1': 67.03, '2': 99.5, '3': 86.12}', 'Recall': '98.22, {'0': 97.8, '1': 99.76, '2': 98.81, '3': 96.53}', 'F1': ... |
| 2 | ONLY_RUS | LGB | {'Precision': '97.75 +- 0.15', 'Recall': '97.9 +- 0.26', 'F1': '97.82 +- 0.19', 'ROC_AUC': '99.94 +- 0.01', 'Log_Loss': '0.0446 +- 0.0037'} | {'Precision': '87.38, {'0': 100.0, '1': 67.24, '2': 99.4, '3': 82.9}', 'Recall': '98.28, {'0': 97.78, '1': 99.69, '2': 98.96, '3': 96.68}', 'F1': ... |
| 3 | ONLY_RUS | CATBOOST | {'Precision': '97.1 +- 0.15', 'Recall': '97.15 +- 0.23', 'F1': '97.12 +- 0.18', 'ROC_AUC': '99.9 +- 0.01', 'Log_Loss': '0.0584 +- 0.0031'} | {'Precision': '81.04, {'0': 100.0, '1': 67.88, '2': 92.49, '3': 63.79}', 'Recall': '98.23, {'0': 97.25, '1': 99.76, '2': 99.85, '3': 96.07}', 'F1'... |
| 4 | ONLY_RUS | XGB_BORUTA | {'Precision': '97.34 +- 0.22', 'Recall': '97.5 +- 0.19', 'F1': '97.42 +- 0.18', 'ROC_AUC': '99.92 +- 0.01', 'Log_Loss': '0.0521 +- 0.0038'} | {'Precision': '86.33, {'0': 100.0, '1': 65.23, '2': 99.55, '3': 80.56}', 'Recall': '97.98, {'0': 97.57, '1': 99.73, '2': 98.26, '3': 96.37}', 'F1'... |
| 5 | ROS | RF | {'Precision': '98.29 +- 0.14', 'Recall': '98.09 +- 0.26', 'F1': '98.18 +- 0.18', 'ROC_AUC': '99.91 +- 0.02', 'Log_Loss': '0.0586 +- 0.0094'} | {'Precision': '83.36, {'0': 100.0, '1': 67.76, '2': 94.95, '3': 70.75}', 'Recall': '98.31, {'0': 97.48, '1': 99.73, '2': 99.2, '3': 96.83}', 'F1':... |
| 6 | ROS | XGB | {'Precision': '98.53 +- 0.15', 'Recall': '98.56 +- 0.22', 'F1': '98.54 +- 0.18', 'ROC_AUC': '99.95 +- 0.01', 'Log_Loss': '0.0391 +- 0.0044'} | {'Precision': '84.96, {'0': 99.99, '1': 66.77, '2': 99.5, '3': 73.59}', 'Recall': '98.05, {'0': 97.6, '1': 99.76, '2': 98.46, '3': 96.37}', 'F1': ... |
| 7 | ROS | LGB | {'Precision': '98.57 +- 0.17', 'Recall': '98.6 +- 0.17', 'F1': '98.58 +- 0.15', 'ROC_AUC': '99.95 +- 0.01', 'Log_Loss': '0.0383 +- 0.0031'} | {'Precision': '87.05, {'0': 100.0, '1': 66.71, '2': 99.5, '3': 81.99}', 'Recall': '98.34, {'0': 97.71, '1': 99.76, '2': 98.91, '3': 96.98}', 'F1':... |
| 8 | ROS | CATBOOST | {'Precision': '97.96 +- 0.1', 'Recall': '97.96 +- 0.17', 'F1': '97.95 +- 0.11', 'ROC_AUC': '99.93 +- 0.01', 'Log_Loss': '0.054 +- 0.0031'} | {'Precision': '80.74, {'0': 100.0, '1': 66.41, '2': 92.1, '3': 64.46}', 'Recall': '98.46, {'0': 97.1, '1': 99.76, '2': 99.7, '3': 97.28}', 'F1': '... |
| 9 | ROS | XGB_BORUTA | {'Precision': '98.38 +- 0.2', 'Recall': '98.49 +- 0.25', 'F1': '98.43 +- 0.21', 'ROC_AUC': '99.94 +- 0.01', 'Log_Loss': '0.0417 +- 0.0038'} | {'Precision': '85.53, {'0': 100.0, '1': 65.58, '2': 99.45, '3': 77.09}', 'Recall': '98.04, {'0': 97.54, '1': 99.73, '2': 98.81, '3': 96.07}', 'F1'... |
| 10 | SMOTENC | RF | {'Precision': '97.93 +- 0.11', 'Recall': '97.64 +- 0.2', 'F1': '97.78 +- 0.08', 'ROC_AUC': '99.88 +- 0.02', 'Log_Loss': '0.0725 +- 0.0083'} | {'Precision': '79.06, {'0': 100.0, '1': 64.8, '2': 93.47, '3': 57.98}', 'Recall': '98.18, {'0': 96.86, '1': 99.73, '2': 99.0, '3': 97.13}', 'F1': ... |
| 11 | SMOTENC | XGB | {'Precision': '98.06 +- 0.19', 'Recall': '97.97 +- 0.18', 'F1': '98.01 +- 0.14', 'ROC_AUC': '99.93 +- 0.01', 'Log_Loss': '0.0494 +- 0.0031'} | {'Precision': '83.75, {'0': 100.0, '1': 65.15, '2': 98.76, '3': 71.08}', 'Recall': '98.31, {'0': 97.38, '1': 99.25, '2': 99.35, '3': 97.28}', 'F1'... |
| 12 | SMOTENC | LGB | {'Precision': '98.14 +- 0.11', 'Recall': '98.12 +- 0.2', 'F1': '98.13 +- 0.12', 'ROC_AUC': '99.93 +- 0.01', 'Log_Loss': '0.0468 +- 0.0037'} | {'Precision': '84.82, {'0': 100.0, '1': 63.72, '2': 98.81, '3': 76.73}', 'Recall': '98.28, {'0': 97.33, '1': 99.28, '2': 99.4, '3': 97.13}', 'F1':... |
| 13 | SMOTENC | CATBOOST | {'Precision': '97.6 +- 0.14', 'Recall': '97.56 +- 0.28', 'F1': '97.57 +- 0.17', 'ROC_AUC': '99.91 +- 0.01', 'Log_Loss': '0.0598 +- 0.0024'} | {'Precision': '77.76, {'0': 100.0, '1': 64.76, '2': 91.69, '3': 54.59}', 'Recall': '98.18, {'0': 96.68, '1': 99.73, '2': 99.35, '3': 96.98}', 'F1'... |
| 14 | SMOTENC | XGB_BORUTA | {'Precision': '98.03 +- 0.17', 'Recall': '97.92 +- 0.19', 'F1': '97.97 +- 0.15', 'ROC_AUC': '99.92 +- 0.01', 'Log_Loss': '0.0524 +- 0.0028'} | {'Precision': '84.86, {'0': 100.0, '1': 64.62, '2': 98.8, '3': 76.01}', 'Recall': '97.94, {'0': 97.44, '1': 99.25, '2': 98.41, '3': 96.68}', 'F1':... |
| 15 | CTGAN | RF | {'Precision': '97.46 +- 0.15', 'Recall': '97.13 +- 0.3', 'F1': '97.29 +- 0.19', 'ROC_AUC': '99.87 +- 0.02', 'Log_Loss': '0.0697 +- 0.0071'} | {'Precision': '82.7, {'0': 99.99, '1': 67.19, '2': 94.52, '3': 69.09}', 'Recall': '97.9, {'0': 97.39, '1': 99.76, '2': 99.6, '3': 94.86}', 'F1': '... |
| 16 | CTGAN | XGB | {'Precision': '97.93 +- 0.14', 'Recall': '97.74 +- 0.21', 'F1': '97.83 +- 0.16', 'ROC_AUC': '99.93 +- 0.01', 'Log_Loss': '0.0471 +- 0.0036'} | {'Precision': '87.15, {'0': 99.99, '1': 67.5, '2': 99.45, '3': 81.67}', 'Recall': '98.16, {'0': 97.79, '1': 99.76, '2': 98.86, '3': 96.22}', 'F1':... |
| 17 | CTGAN | LGB | {'Precision': '98.0 +- 0.13', 'Recall': '98.01 +- 0.2', 'F1': '98.0 +- 0.13', 'ROC_AUC': '99.94 +- 0.01', 'Log_Loss': '0.043 +- 0.0027'} | {'Precision': '86.24, {'0': 100.0, '1': 67.28, '2': 99.4, '3': 78.27}', 'Recall': '98.3, {'0': 97.71, '1': 99.69, '2': 98.96, '3': 96.83}', 'F1': ... |
| 18 | CTGAN | CATBOOST | {'Precision': '97.52 +- 0.1', 'Recall': '97.35 +- 0.26', 'F1': '97.43 +- 0.18', 'ROC_AUC': '99.91 +- 0.01', 'Log_Loss': '0.0573 +- 0.0026'} | {'Precision': '79.57, {'0': 100.0, '1': 70.04, '2': 89.69, '3': 58.57}', 'Recall': '97.98, {'0': 97.24, '1': 99.73, '2': 99.95, '3': 95.02}', 'F1'... |
| 19 | CTGAN | XGB_BORUTA | {'Precision': '97.56 +- 0.1', 'Recall': '97.34 +- 0.15', 'F1': '97.45 +- 0.11', 'ROC_AUC': '99.91 +- 0.01', 'Log_Loss': '0.0553 +- 0.0031'} | {'Precision': '83.86, {'0': 100.0, '1': 66.34, '2': 99.35, '3': 69.76}', 'Recall': '98.15, {'0': 97.48, '1': 99.59, '2': 99.0, '3': 96.53}', 'F1':... |
In [11]:
# Save results DataFrame
# Create the reports directory first: to_csv does not create missing parent
# folders, and failing here would lose the summary of a long training run.
os.makedirs(REPORTS_PATH, exist_ok=True)
results_df.to_csv(f"{REPORTS_PATH}training_results.csv", index=False)
In [ ]: